iT邦幫忙

2025 iThome 鐵人賽

DAY 11
0
生成式 AI

練習AI系列 第 12

文件摘要器(PDF/TXT/MD/DOCX → 結構化摘要)

  • 分享至 

  • xImage
  •  

📦 安裝相依套件(新增)
npm i pdf-parse mammoth

🆕 程式碼

  1. src/utils/io.js(新增)
    // src/utils/io.js
    import fs from "fs";
    import path from "path";
    import pdfParse from "pdf-parse";
    import mammoth from "mammoth";

/**
 * Make sure a directory exists, creating it (and any missing parents) when
 * nothing is present at the given path yet.
 *
 * @param {string} dir - Directory path to create if absent.
 * @returns {void}
 */
export function ensureDir(dir) {
  const alreadyPresent = fs.existsSync(dir);
  if (alreadyPresent) {
    return;
  }
  fs.mkdirSync(dir, { recursive: true });
}

/**
 * Guess a file extension from a Content-Type header value (simplified).
 *
 * Rules are checked in order with a substring test and the first match wins;
 * anything unrecognised maps to ".bin".
 *
 * @param {string} [ct=""] - Content-Type header value.
 * @returns {string} Extension including the leading dot.
 */
function extFromContentType(ct = "") {
  const rules = [
    ["pdf", ".pdf"],
    ["msword", ".doc"],
    ["officedocument.wordprocessingml.document", ".docx"],
    ["text/plain", ".txt"],
    ["markdown", ".md"],
  ];
  const hit = rules.find(([needle]) => ct.includes(needle));
  return hit ? hit[1] : ".bin";
}

/**
 * Download a remote file into a local directory and return the saved path.
 *
 * Uses the global `fetch` (built into Node 18+). The file is named
 * `dl_<timestamp><ext>`, where the extension is derived from the response's
 * Content-Type header via `extFromContentType`.
 *
 * @param {string} url - Remote URL to download.
 * @param {string} [outDir="outputs/downloads"] - Directory to save into (created if missing).
 * @returns {Promise<string>} Path of the written file.
 * @throws {Error} When the HTTP response is not OK.
 */
export async function downloadToTemp(url, outDir = "outputs/downloads") {
  ensureDir(outDir);

  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`下載失敗:${res.status} ${res.statusText}`);
  }

  const buf = Buffer.from(await res.arrayBuffer());
  const ct = res.headers.get("content-type") || "";
  const ext = extFromContentType(ct);
  const fp = path.join(outDir, `dl_${Date.now()}${ext}`);
  fs.writeFileSync(fp, buf);
  return fp;
}

/**
 * Read a plain-text file (.txt / .md) and return its contents decoded as UTF-8.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {string} File contents.
 */
function readTxtLike(filePath) {
  const contents = fs.readFileSync(filePath, { encoding: "utf-8" });
  return contents;
}

/**
 * Read a PDF file and return the text reported by `pdfParse`.
 *
 * @param {string} filePath - Path of the PDF file.
 * @returns {Promise<string>} The parsed `text` field, or "" when it is missing/empty.
 */
async function readPdf(filePath) {
  const parsed = await pdfParse(fs.readFileSync(filePath));
  const { text } = parsed;
  return text || "";
}

/**
 * Read a DOCX file and return the raw text produced by `mammoth.extractRawText`.
 *
 * @param {string} filePath - Path of the DOCX file.
 * @returns {Promise<string>} The extracted `value`, or "" when it is missing/empty.
 */
async function readDocx(filePath) {
  const buffer = fs.readFileSync(filePath);
  const extraction = await mammoth.extractRawText({ buffer });
  return extraction.value || "";
}

/**
 * Read a document's text, choosing the reader from the file extension
 * (compared case-insensitively).
 *
 * Supported: .txt, .md (plain text), .pdf, .docx.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string>} Extracted text content.
 * @throws {Error} When the extension is not one of the supported formats.
 */
export async function readTextFile(filePath) {
  const ext = path.extname(filePath).toLowerCase();
  switch (ext) {
    case ".txt":
    case ".md":
      return readTxtLike(filePath);
    case ".pdf":
      return await readPdf(filePath);
    case ".docx":
      return await readDocx(filePath);
    default:
      throw new Error(`不支援的檔案格式:${ext}`);
  }
}

/**
 * Serialize a value as pretty-printed JSON (2-space indent) and write it to a
 * file as UTF-8.
 *
 * @param {string} filePath - Destination path.
 * @param {*} obj - Value to serialize with `JSON.stringify`.
 * @returns {void}
 */
export function writeJson(filePath, obj) {
  const payload = JSON.stringify(obj, null, 2);
  fs.writeFileSync(filePath, payload, { encoding: "utf-8" });
}
/**
 * Write a text string to a file as UTF-8, replacing any existing contents.
 *
 * @param {string} filePath - Destination path.
 * @param {string} text - Text to write.
 * @returns {void}
 */
export function writeText(filePath, text) {
  const writeOptions = { encoding: "utf-8" };
  fs.writeFileSync(filePath, text, writeOptions);
}

  2. src/day11_doc_summarizer.js(新增)
    // src/day11_doc_summarizer.js
    import path from "path";
    import { openai } from "./aiClient.js";
    import { ensureDir, downloadToTemp, readTextFile, writeJson, writeText } from "./utils/io.js";

/**
 * Split text into overlapping chunks, using character count as a rough proxy
 * for tokens.
 *
 * Each chunk holds at most `chunkSize` characters, and consecutive chunks share
 * `overlap` characters so sentences are not cut too abruptly at the boundary.
 * The loop stops as soon as a chunk reaches the end of the text; without that
 * stop, stepping back by `overlap` after the last chunk would re-enter the loop
 * forever.
 *
 * @param {string} text - Text to split.
 * @param {number} [chunkSize=1200] - Maximum characters per chunk (positive integer).
 * @param {number} [overlap=120] - Characters shared between neighbours (0 <= overlap < chunkSize).
 * @returns {string[]} Chunks in document order; empty array for empty text.
 * @throws {RangeError} When `chunkSize` or `overlap` would prevent forward progress.
 */
function chunkText(text, chunkSize = 1200, overlap = 120) {
  if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
    throw new RangeError("chunkSize 必須為正整數");
  }
  // overlap >= chunkSize would make the window start move backwards or stall.
  if (!Number.isInteger(overlap) || overlap < 0 || overlap >= chunkSize) {
    throw new RangeError("overlap 必須為小於 chunkSize 的非負整數");
  }

  const chunks = [];
  let start = 0;
  while (start < text.length) {
    const end = Math.min(text.length, start + chunkSize);
    chunks.push(text.slice(start, end));
    if (end >= text.length) break; // last chunk emitted; do not step back again
    start = end - overlap;
  }
  return chunks;
}

/**
 * Summarize a single chunk of text (the "map" stage).
 *
 * Sends the chunk to the chat-completions API together with the requested tone
 * and length, and returns the trimmed reply.
 *
 * @param {string} chunk - Text to summarize.
 * @param {object} [opts]
 * @param {string} [opts.tone="professional"] - Tone inserted into the prompt.
 * @param {string} [opts.length="medium"] - Length hint inserted into the prompt.
 * @returns {Promise<string>} Summary text, or "" when the response has no content.
 */
async function summarizeChunk(chunk, opts) {
  const { tone = "professional", length = "medium" } = opts ?? {};
  const res = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    temperature: 0.3,
    messages: [
      {
        role: "system",
        content:
          "你是嚴謹的中文技術編輯。請以重點清單 + 2~3 句摘要回覆,不可虛構內容。",
      },
      {
        role: "user",
        content: `請摘要以下內容。語氣:${tone}。長度:${length}。\n\n${chunk}`,
      },
    ],
  });
  return res.choices?.[0]?.message?.content?.trim() || "";
}

/**
 * Merge the per-chunk summaries into one high-level summary (the "reduce" stage).
 *
 * The model is asked to answer with pure JSON. Replies wrapped in a Markdown
 * code fence (``` or ```json) are unwrapped before parsing.
 *
 * @param {string[]} summaries - Per-chunk summaries in document order.
 * @param {object} [opts]
 * @param {string} [opts.tone="professional"] - Tone inserted into the prompt.
 * @param {string} [opts.length="medium"] - Length hint inserted into the prompt.
 * @returns {Promise<object>} Parsed JSON; the prompt requests the keys
 *   `tldr`, `outline`, `keyPoints`, `actionItems` and `questions`.
 * @throws {SyntaxError} When the model's reply is not valid JSON.
 */
async function reduceSummaries(summaries, opts) {
  const { tone = "professional", length = "medium" } = opts ?? {};
  const joined = summaries.map((s, i) => `# 小節${i + 1}\n${s}`).join("\n\n");
  const res = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    temperature: 0.3,
    messages: [
      {
        role: "system",
        content:
          '你是嚴謹的中文總編輯。整合所有小節摘要,產出:\n1) TL;DR(3~5 句)\n2) Outline(6~12 條)\n3) KeyPoints(5~10 條)\n4) ActionItems(可執行清單,若無則空陣列)\n5) Questions(讀者可能想追問的 3~6 題)\n請以純 JSON 回覆,格式:{"tldr":"...","outline":[...],"keyPoints":[...],"actionItems":[...],"questions":[...]}\n內容不得虛構。',
      },
      {
        role: "user",
        content: `語氣:${tone},長度:${length}。\n以下為各小節摘要,請彙整:\n\n${joined}`,
      },
    ],
  });
  const raw = res.choices?.[0]?.message?.content?.trim() || "{}";
  // Take the body of a fenced block when present; otherwise parse the raw reply.
  const json = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1] ?? raw;
  return JSON.parse(json);
}

/**
 * Ask the model to pick verbatim sentences from the original text that are
 * suitable for quoting (the prompt forbids rewriting, to avoid fabrication).
 *
 * Only the first 16000 characters of the original are sent. Replies wrapped in
 * a Markdown code fence are unwrapped before parsing. This step is best-effort:
 * an unparsable or non-array reply yields an empty list instead of an error.
 *
 * @param {string} original - Full original text.
 * @param {number} [limit=5] - Maximum number of quotes to request and return.
 * @returns {Promise<Array>} Up to `limit` entries from the model's JSON array.
 */
async function extractQuotes(original, limit = 5) {
  const res = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    temperature: 0.2,
    messages: [
      {
        role: "system",
        content: `從原文中挑選最多 ${limit} 句適合直接引用的『原句』,不可改寫;若沒有合適句子可少於 ${limit}。`,
      },
      {
        role: "user",
        content: `請由以下原文挑選金句(以 JSON 陣列回覆):\n${original.slice(0, 16000)}`,
      },
    ],
  });
  const raw = res.choices?.[0]?.message?.content?.trim() || "[]";
  // Take the body of a fenced block when present; otherwise parse the raw reply.
  const json = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1] ?? raw;
  try {
    const parsed = JSON.parse(json);
    // Callers iterate the result, so anything that is not an array becomes [].
    return Array.isArray(parsed) ? parsed.slice(0, limit) : [];
  } catch {
    // Deliberate best-effort: quotes are optional, so a bad reply is not fatal.
    return [];
  }
}

/**
 * Render the assembled summary as a Markdown document.
 *
 * A list section that is missing, empty, or not an array is rendered as its
 * placeholder line. (A plain `items.map(...) || fallback` never falls back,
 * because mapping an empty array yields a truthy empty array.)
 *
 * @param {object} result - Summary object built by `summarizeDocument`.
 * @returns {string} Markdown text ending with a newline.
 */
function renderSummaryMarkdown(result) {
  const section = (items, format, placeholder) =>
    Array.isArray(items) && items.length > 0 ? items.map(format) : [placeholder];

  return [
    `# ${result.title}`,
    ``,
    `- 產出時間:${result.createdAt}`,
    `- 字數:約 ${result.wordCount}`,
    `- 分塊數:${result.chunks}`,
    ``,
    `## TL;DR`,
    result.tldr || "(無)",
    ``,
    `## Outline`,
    ...section(result.outline, (o, i) => `${i + 1}. ${o}`, "(無)"),
    ``,
    `## Key Points`,
    ...section(result.keyPoints, (o) => `- ${o}`, "(無)"),
    ``,
    `## Action Items`,
    ...section(result.actionItems, (o) => `- [ ] ${o}`, "- (無)"),
    ``,
    `## Questions`,
    ...section(result.questions, (o) => `- ${o}`, "- (無)"),
    ``,
    `## 引用金句`,
    ...section(result.quotes, (q) => `> ${q}`, "> (無)"),
    ``,
  ].join("\n");
}

/**
 * Main flow: read file → chunk → map → reduce → write outputs.
 *
 * Writes `summary_<timestamp>.json` and `summary_<timestamp>.md` under
 * `outputs/docs`.
 *
 * @param {object} opts
 * @param {string} [opts.filePath] - Local file path.
 * @param {string} [opts.url] - Remote URL (downloaded first when `filePath` is not given).
 * @param {number} [opts.chunkSize=1200]
 * @param {number} [opts.overlap=120]
 * @param {("short"|"medium"|"long")} [opts.length="medium"]
 * @param {string} [opts.tone="professional"]
 * @returns {Promise<{jsonPath: string, mdPath: string, meta: object}>}
 *   Paths of the written files plus the summary object.
 * @throws {Error} When neither `filePath` nor `url` is given, or the document text is empty.
 */
export async function summarizeDocument(opts = {}) {
  let { filePath, url, chunkSize = 1200, overlap = 120, length = "medium", tone = "professional" } = opts;

  // 1) Resolve a local file path (download when only a URL is provided).
  if (!filePath && !url) throw new Error("請提供 filePath 或 url 其一。");
  if (!filePath) filePath = await downloadToTemp(url);

  // 2) Read the document text.
  const fullText = (await readTextFile(filePath)).trim();
  if (!fullText) throw new Error("文件內容為空或無法解析。");

  // 3) Chunk & map. Chunks are summarized one at a time, in document order.
  const chunks = chunkText(fullText, chunkSize, overlap);
  const perChunkSummaries = [];
  for (const c of chunks) {
    const s = await summarizeChunk(c, { tone, length });
    perChunkSummaries.push(s);
  }

  // 4) Reduce: merge the per-chunk summaries.
  const merged = await reduceSummaries(perChunkSummaries, { tone, length });

  // 5) Quotes are picked from the original text, not from the summaries.
  const quotes = await extractQuotes(fullText, 5);

  // 6) Assemble the result object.
  const titleGuess = path.basename(filePath);
  const result = {
    title: titleGuess,
    wordCount: fullText.length, // character count of the trimmed text
    chunks: chunks.length,
    tldr: merged.tldr || "",
    outline: merged.outline || [],
    keyPoints: merged.keyPoints || [],
    actionItems: merged.actionItems || [],
    questions: merged.questions || [],
    quotes,
    createdAt: new Date().toISOString(),
  };

  // 7) Persist JSON and Markdown side by side, sharing one timestamp.
  const outDir = path.join("outputs", "docs");
  ensureDir(outDir);
  const stamp = Date.now();
  const jsonPath = path.join(outDir, `summary_${stamp}.json`);
  const mdPath = path.join(outDir, `summary_${stamp}.md`);

  writeJson(jsonPath, result);
  writeText(mdPath, renderSummaryMarkdown(result));

  return { jsonPath, mdPath, meta: result };
}

  3. index.js(修改:加入 docsum 入口)
    // index.js(只示範新增片段,保留你原有分支)
    import { summarizeDocument } from "./src/day11_doc_summarizer.js";

// ...前略(既有 args 解析與其他 task)

/**
 * CLI entry point: dispatch on `args.task` (defaults to "chat").
 *
 * Only the "docsum" branch is shown here; it forwards the CLI options to
 * `summarizeDocument` and prints the output paths plus the TL;DR.
 * NOTE(review): `args` is not defined in this snippet — it is expected to come
 * from the existing argument parsing elided above; confirm in the full file.
 */
async function main() {
const task = args.task || "chat";

if (task === "docsum") {
const filePath = args.filePath || null;
const url = args.url || null;
const length = args.length || "medium"; // short | medium | long
const tone = args.tone || "professional"; // friendly | professional
// Numeric options are converted with Number(); defaults apply when the flag is absent.
const chunkSize = args.chunkSize ? Number(args.chunkSize) : 1200;
const overlap = args.overlap ? Number(args.overlap) : 120;

const out = await summarizeDocument({ filePath, url, length, tone, chunkSize, overlap });
console.log("\n=== 文件摘要完成 ===");
console.log("- JSON:", out.jsonPath);
console.log("- Markdown:", out.mdPath);
console.log("\nTL;DR:\n", out.meta.tldr);

// ...the remaining task branches stay unchanged
} else {
// existing else branch omitted
}
}

// Run the CLI; on any rejection print the error message and exit with status 1.
main().catch((err) => {
  const { message } = err;
  console.error("發生錯誤:", message);
  process.exit(1);
});

  4. package.json(新增 Scripts)
    {
    "scripts": {
    "day11:txt": "node index.js --task docsum --filePath sample/article.md --length medium --tone professional",
    "day11:pdf": "node index.js --task docsum --filePath sample/whitepaper.pdf --length short --tone friendly",
    "day11:url": "node index.js --task docsum --url https://example.com/sample.pdf --length long --tone professional"
    }
    }

▶️ 如何執行(CLI)

讀本地 PDF

npm run day11:pdf --silent

讀本地 TXT/MD

npm run day11:txt --silent

直接貼 URL(會先下載到 outputs/downloads/ 再處理)

npm run day11:url --silent

完成後你會看到:

outputs/docs/summary_17265xxxxx.json
outputs/docs/summary_17265xxxxx.md


上一篇
多模態應用整合!
下一篇
AI 翻譯器(多語、術語表、品質檢查)
系列文
練習AI16
圖片
  熱門推薦
圖片
{{ item.channelVendor }} | {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言